{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 基于用户的协同过滤"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "#load数据（用户和物品索引，以及倒排表）\n",
    "import pickle\n",
    "\n",
    "#稀疏矩阵，打分表\n",
    "import scipy.io as sio\n",
    "import os\n",
    "\n",
    "#距离\n",
    "import scipy.spatial.distance as ssd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读入训练数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): pickle.load can execute arbitrary code; only load files written by\n",
    "# this project's own preprocessing step.\n",
    "def _load_pickle(path):\n",
    "    \"\"\"Load one pickled object from `path`, closing the file handle afterwards.\"\"\"\n",
    "    with open(path, 'rb') as f:\n",
    "        return pickle.load(f)\n",
    "\n",
    "# Lookups from raw user / item ids to matrix indices\n",
    "users_index = _load_pickle(\"users_index.pkl\")\n",
    "items_index = _load_pickle(\"items_index.pkl\")\n",
    "\n",
    "n_users = len(users_index)\n",
    "n_items = len(items_index)\n",
    "\n",
    "# Inverted tables\n",
    "# user index -> items that user has rated\n",
    "user_items = _load_pickle(\"user_items.pkl\")\n",
    "# item index -> users who have rated that item\n",
    "item_users = _load_pickle(\"item_users.pkl\")\n",
    "\n",
    "# User-item rating matrix R, converted to CSR so single elements can be indexed\n",
    "user_item_scores = sio.mmread(\"user_item_scores\").tocsr()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 计算每个用户的平均打分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean rating of every user, taken over the items that user has rated.\n",
    "users_mu = np.zeros(n_users)\n",
    "for u in range(n_users):\n",
    "    n_user_items = 0\n",
    "    r_acc = 0.0\n",
    "\n",
    "    for i in user_items[u]:  # items rated by user u\n",
    "        r_acc += user_item_scores[u, i]\n",
    "        n_user_items += 1\n",
    "\n",
    "    # A user without any rating keeps a mean of 0.0 instead of raising ZeroDivisionError.\n",
    "    if n_user_items > 0:\n",
    "        users_mu[u] = r_acc / n_user_items"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 计算两个用户之间的相似度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def user_similarity(uid1, uid2):\n",
    "    \"\"\"Mean-centred cosine similarity of two users over their co-rated items.\n",
    "\n",
    "    Each user's ratings on the co-rated items are shifted by that user's mean\n",
    "    rating (`users_mu`) before the cosine is computed.\n",
    "\n",
    "    Args:\n",
    "        uid1: index of the first user (row of `user_item_scores`).\n",
    "        uid2: index of the second user.\n",
    "\n",
    "    Returns:\n",
    "        float in [-1, 1]. 0.0 when the users share no rated item, or when either\n",
    "        centred rating vector has zero norm (the cosine is undefined there).\n",
    "    \"\"\"\n",
    "    rated_by_uid2 = set(user_items[uid2])  # set gives O(1) membership tests\n",
    "\n",
    "    # Co-rated items, in uid1's order, each counted once.\n",
    "    common = [item for item in dict.fromkeys(user_items[uid1]) if item in rated_by_uid2]\n",
    "    if not common:\n",
    "        return 0.0\n",
    "\n",
    "    # Ratings on the co-rated items, centred on each user's own mean\n",
    "    s1 = np.array([user_item_scores[uid1, item] - users_mu[uid1] for item in common])\n",
    "    s2 = np.array([user_item_scores[uid2, item] - users_mu[uid2] for item in common])\n",
    "\n",
    "    # Check the norm explicitly: dividing by a zero norm (all co-rated ratings equal\n",
    "    # to the user's mean) only produces NaN plus a RuntimeWarning.\n",
    "    denom = np.sqrt(np.dot(s1, s1) * np.dot(s2, s2))\n",
    "    if not denom > 0:  # true for both zero and NaN\n",
    "        return 0.0\n",
    "\n",
    "    similarity = np.dot(s1, s2) / denom\n",
    "    if np.isnan(similarity):\n",
    "        return 0.0\n",
    "    # Rounding can push the ratio marginally outside [-1, 1].\n",
    "    return float(np.clip(similarity, -1.0, 1.0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 预计算好所有用户之间的相似性\n",
    "对用户比较少、用户比较固定的系统适用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ui=0 \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Program Files\\Anaconda\\lib\\site-packages\\scipy\\spatial\\distance.py:702: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  dist = 1.0 - uv / np.sqrt(uu * vv)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ui=100 \n",
      "ui=200 \n",
      "ui=300 \n",
      "ui=400 \n",
      "ui=500 \n",
      "ui=600 \n",
      "ui=700 \n",
      "ui=800 \n",
      "ui=900 \n"
     ]
    }
   ],
   "source": [
    "# Pairwise user-user similarity matrix: symmetric, with ones on the diagonal.\n",
    "# np.matrix is kept so the pickled object has the same type as before.\n",
    "users_similarity_matrix = np.matrix(np.zeros(shape=(n_users, n_users)), float)\n",
    "\n",
    "for ui in range(n_users):\n",
    "    users_similarity_matrix[ui, ui] = 1.0\n",
    "\n",
    "    # Progress report every 100 users\n",
    "    if ui % 100 == 0:\n",
    "        print(\"ui=%d \" % (ui))\n",
    "\n",
    "    # Similarity is symmetric, so compute each pair once and mirror it.\n",
    "    for uj in range(ui + 1, n_users):\n",
    "        sim = user_similarity(ui, uj)\n",
    "        users_similarity_matrix[uj, ui] = sim\n",
    "        users_similarity_matrix[ui, uj] = sim\n",
    "\n",
    "# The context manager makes sure the dump is flushed and the handle closed.\n",
    "with open(\"users_similarity.pkl\", 'wb') as f:\n",
    "    pickle.dump(users_similarity_matrix, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def users_similarity(n_users):\n",
    "    \"\"\"Compute the full user-user similarity matrix and pickle it.\n",
    "\n",
    "    Args:\n",
    "        n_users: number of users; pairs (ui, uj) with 0 <= ui < uj < n_users are\n",
    "            scored with `user_similarity`.\n",
    "\n",
    "    Returns:\n",
    "        np.matrix of shape (n_users, n_users), symmetric, with ones on the diagonal.\n",
    "\n",
    "    Side effects:\n",
    "        Prints a progress line every 100 users and writes the matrix to\n",
    "        \"users_similarity.pkl\" in the working directory.\n",
    "    \"\"\"\n",
    "    users_similarity_matrix = np.matrix(np.zeros(shape=(n_users, n_users)), float)\n",
    "\n",
    "    for ui in range(n_users):\n",
    "        users_similarity_matrix[ui, ui] = 1.0\n",
    "\n",
    "        # Progress report every 100 users\n",
    "        if ui % 100 == 0:\n",
    "            print(\"ui=:%d \" % (ui))\n",
    "\n",
    "        # Similarity is symmetric, so compute each pair once and mirror it.\n",
    "        for uj in range(ui + 1, n_users):\n",
    "            sim = user_similarity(ui, uj)\n",
    "            users_similarity_matrix[uj, ui] = sim\n",
    "            users_similarity_matrix[ui, uj] = sim\n",
    "\n",
    "    # The context manager makes sure the dump is flushed and the handle closed.\n",
    "    with open(\"users_similarity.pkl\", 'wb') as f:\n",
    "        pickle.dump(users_similarity_matrix, f)\n",
    "    return users_similarity_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#所有用户之间的相似度\n",
    "#users_similarity_matrix = pickle.load(open(\"users_similarity.pkl\", 'rb'))\n",
    "#users_similarity_matrix = users_similarity(n_users)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 预测用户对item的打分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict the rating a user would give to an item\n",
    "def User_CF_pred(uid, iid):\n",
    "    \"\"\"Predict the rating of user `uid` for item `iid` with user-based CF.\n",
    "\n",
    "    The prediction is the user's mean rating plus the similarity-weighted average\n",
    "    of the mean-centred ratings that other users gave to `iid`. When no rater has\n",
    "    a non-zero similarity to `uid`, the user's own mean rating is returned.\n",
    "    \"\"\"\n",
    "    weighted_dev_sum = 0.0\n",
    "    abs_sim_sum = 0.0\n",
    "\n",
    "    for rater in item_users[iid]:  # every user who rated item iid\n",
    "        # Precomputed similarity between the rater and the target user\n",
    "        # (user_similarity(rater, uid) would compute it on the fly instead).\n",
    "        weight = users_similarity_matrix[rater, uid]\n",
    "        if weight == 0:\n",
    "            continue\n",
    "\n",
    "        # The rater's score for the item, relative to the rater's mean\n",
    "        deviation = user_item_scores[rater, iid] - users_mu[rater]\n",
    "        weighted_dev_sum += weight * deviation\n",
    "        abs_sim_sum += np.abs(weight)\n",
    "\n",
    "    baseline = users_mu[uid]\n",
    "    if abs_sim_sum == 0:\n",
    "        # No similar rater: fall back to the user's average rating\n",
    "        return baseline\n",
    "    return baseline + weighted_dev_sum / abs_sim_sum"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对给定用户，推荐物品/计算打分\n",
    "不同的推荐算法，只是预测打分函数不同，\n",
    "user_items_scores[i] = User_CF_pred(cur_user_id, i)  #预测打分\n",
    "\n",
    "如User_CF_pred, Item_CF_pred, svd_CF_pred,...\n",
    "甚至基于内容的推荐也是一样。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# user: raw user id\n",
    "# Returns the candidate items with their predicted scores (DataFrame)\n",
    "def recommend(user):\n",
    "    \"\"\"Score every item the user has not rated in training and rank the results.\n",
    "\n",
    "    Args:\n",
    "        user: raw user id; must be a key of `users_index`.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with columns ['item_id', 'score']: one row per item the\n",
    "        user did not rate in the training set, ordered by predicted score\n",
    "        descending (ties: larger item index first). Rows whose predicted score\n",
    "        is NaN are dropped.\n",
    "    \"\"\"\n",
    "    cur_user_id = users_index[user]\n",
    "\n",
    "    # Items the user rated in training; a set gives O(1) membership tests.\n",
    "    cur_user_items = set(user_items[cur_user_id])\n",
    "\n",
    "    # Predicted score per item index. Already-rated items stay at 0 and are\n",
    "    # filtered out when the result is assembled.\n",
    "    user_items_scores = np.zeros(n_items)\n",
    "    for i in range(n_items):\n",
    "        if i not in cur_user_items:\n",
    "            user_items_scores[i] = User_CF_pred(cur_user_id, i)\n",
    "\n",
    "    # (score, item index) pairs, highest score first\n",
    "    sort_index = sorted(((e, i) for i, e in enumerate(list(user_items_scores))), reverse=True)\n",
    "\n",
    "    # Reverse lookup item index -> raw item id, built once instead of scanning\n",
    "    # items_index for every row. setdefault keeps the first id seen for an index.\n",
    "    index_to_item = {}\n",
    "    for item, idx in items_index.items():\n",
    "        index_to_item.setdefault(idx, item)\n",
    "\n",
    "    records = [\n",
    "        (index_to_item[idx], score)\n",
    "        for score, idx in sort_index\n",
    "        if not np.isnan(score) and idx not in cur_user_items\n",
    "    ]\n",
    "\n",
    "    # Build the frame in one go; appending row by row through df.loc is quadratic.\n",
    "    return pd.DataFrame(records, columns=['item_id', 'score'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取测试数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取测试数据\n",
    "# Column names of the tab-separated rating triplets (plus timestamp)\n",
    "triplet_cols = ['user_id', 'item_id', 'rating', 'timestamp']\n",
    "\n",
    "dpath = './data/'\n",
    "test_path = dpath + 'u1.test'\n",
    "df_triplet_test = pd.read_csv(\n",
    "    test_path,\n",
    "    sep='\\t',\n",
    "    names=triplet_cols,\n",
    "    encoding='latin-1',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 测试，并计算评价指标\n",
    "PR、覆盖度、RMSE\n",
    "这部分代码所有的推荐算法相同\n",
    "\n",
    "令系统的用户集合为 U， R(u) 是根据用户在训练集上的行为给用户作出的推荐列表，而 T(u) 是用户在测试集上的行为列表。那么推荐结果的准确率定义为：\n",
    "$$\n",
    "Precision=\\frac{\\sum_{u\\in U}|R(u)\\cap T(u)|}{\\sum_{u\\in U}|R(u)|}\n",
    "$$\n",
    "推荐结果的召回率定义为：\n",
    "$$\n",
    "Recall=\\frac{\\sum_{u\\in U}|R(u)\\cap T(u)|}{\\sum_{u\\in U}|T(u)|}\n",
    "$$\n",
    "\n",
    "推荐系统的覆盖率为：\n",
    "$$\n",
    "Coverage=\\frac{|\\bigcup_{u\\in U}R(u)|}{|I|}\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "599 is a new item.\n",
      "\n",
      "711 is a new item.\n",
      "\n",
      "814 is a new item.\n",
      "\n",
      "830 is a new item.\n",
      "\n",
      "852 is a new item.\n",
      "\n",
      "857 is a new item.\n",
      "\n",
      "1156 is a new item.\n",
      "\n",
      "1236 is a new item.\n",
      "\n",
      "1309 is a new item.\n",
      "\n",
      "1310 is a new item.\n",
      "\n",
      "1320 is a new item.\n",
      "\n",
      "1343 is a new item.\n",
      "\n",
      "1348 is a new item.\n",
      "\n",
      "1364 is a new item.\n",
      "\n",
      "1373 is a new item.\n",
      "\n",
      "1457 is a new item.\n",
      "\n",
      "1458 is a new item.\n",
      "\n",
      "1492 is a new item.\n",
      "\n",
      "1493 is a new item.\n",
      "\n",
      "1498 is a new item.\n",
      "\n",
      "1505 is a new item.\n",
      "\n",
      "1520 is a new item.\n",
      "\n",
      "1533 is a new item.\n",
      "\n",
      "1536 is a new item.\n",
      "\n",
      "1543 is a new item.\n",
      "\n",
      "1557 is a new item.\n",
      "\n",
      "1561 is a new item.\n",
      "\n",
      "1562 is a new item.\n",
      "\n",
      "1563 is a new item.\n",
      "\n",
      "1565 is a new item.\n",
      "\n",
      "1582 is a new item.\n",
      "\n",
      "1586 is a new item.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Distinct users in the test set\n",
    "unique_users_test = df_triplet_test['user_id'].unique()\n",
    "\n",
    "# Number of items recommended to each user\n",
    "n_rec_items = 20\n",
    "\n",
    "# Counters for precision and recall\n",
    "n_hits = 0\n",
    "n_total_rec_items = 0\n",
    "n_test_items = 0\n",
    "\n",
    "# Union of the items recommended to any user, used for coverage\n",
    "all_rec_items = set()\n",
    "\n",
    "# Residual sum of squares, and the number of test ratings it covers, used for RMSE\n",
    "rss_test = 0.0\n",
    "n_scored = 0\n",
    "\n",
    "for user in unique_users_test:\n",
    "    if user not in users_index:  # unseen in training: collaborative filtering cannot score this user\n",
    "        print(str(user) + ' is a new user.\\n')\n",
    "        continue\n",
    "\n",
    "    # Ground truth: this user's ratings in the test set\n",
    "    user_records_test = df_triplet_test[df_triplet_test.user_id == user]\n",
    "    test_items = set(user_records_test['item_id'])\n",
    "\n",
    "    # Predicted scores for every item the user did not rate in training, best first\n",
    "    rec_items = recommend(user)\n",
    "\n",
    "    # Top-n recommendations; head() also copes with fewer than n_rec_items candidates\n",
    "    top_items = rec_items['item_id'].head(n_rec_items).tolist()\n",
    "    for item in top_items:\n",
    "        if item in test_items:\n",
    "            n_hits += 1\n",
    "        all_rec_items.add(item)\n",
    "\n",
    "    # Squared error on the test ratings for which a prediction exists\n",
    "    pred_by_item = dict(zip(rec_items['item_id'], rec_items['score']))\n",
    "    for item, score in zip(user_records_test['item_id'], user_records_test['rating']):\n",
    "        if item not in pred_by_item:  # no prediction, e.g. the item never appeared in training\n",
    "            print(str(item) + ' is a new item.\\n')\n",
    "            continue\n",
    "        rss_test += (pred_by_item[item] - score) ** 2\n",
    "        n_scored += 1\n",
    "\n",
    "    # Number of items actually recommended to this user\n",
    "    n_total_rec_items += len(top_items)\n",
    "\n",
    "    # Number of ground-truth items of this user\n",
    "    n_test_items += user_records_test.shape[0]\n",
    "\n",
    "# Precision & recall (NaN when there is nothing to divide by)\n",
    "precision = n_hits / (1.0 * n_total_rec_items) if n_total_rec_items else float('nan')\n",
    "recall = n_hits / (1.0 * n_test_items) if n_test_items else float('nan')\n",
    "\n",
    "# Coverage: share of the catalogue that was recommended to at least one user\n",
    "coverage = len(all_rec_items) / (1.0 * n_items)\n",
    "\n",
    "# RMSE over the ratings that were actually predicted. Dividing by every test row\n",
    "# would count the skipped new users/items as zero error and understate the RMSE.\n",
    "rmse = np.sqrt(rss_test / n_scored) if n_scored else float('nan')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 推荐数目改为20后的评价指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.001851851851851852"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.00085"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.17454545454545456"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coverage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9658669077094278"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
